Small RNAseq: Differential Expression Analysis

Downloading datasets

Raw data

Raw data was downloaded from the sequencing facility using the secure link, with wget command. The downloaded files were checked for md5sum and compared against list of files expected as per the input samples provided.

wget https://oc1.rnet.missouri.edu/xyxz
# link masked 
# GEO link will be included later
# merge files of same samples (technical replicates)
paste <(ls *_L001_R1_001.fastq.gz) <(ls *_L002_R1_001.fastq.gz) | \
   sed 's/\t/ /g' |\
   awk '{print "cat",$1,$2" > "$1}' |\
   sed 's/_L001_R1_001.fastq.gz/.fq.gz/2' > concatenate.sh
chmod +x concatenate.sh
sh concatenate.sh

Genome/annotation

Additional files required for the analyses were downloaded from GenCode. The downloaded files are as follows:

wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M30/GRCm39.primary_assembly.genome.fa.gz
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M30/gencode.vM30.annotation.gff3.gz
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M30/gencode.vM30.annotation.gtf.gz
gunzip GRCm39.primary_assembly.genome.fa.gz
gunzip gencode.vM30.annotation.gff3.gz
gunzip gencode.vM30.annotation.gtf.gz

QC (bfore processing)

salloc -N 1 --exclusive -p amd -t 8:00:00
conda activate smallrna
for fq in *.fq.gz; do
  fastqc --threads $SLURM_JOB_CPUS_PER_NODE $fq;
done
mkdir -p fastqc_pre
mv *.zip *.html fastqc_pre/

Mapping

To index the genome, following command was run (in an interactive session).

fastaGenome="GRCm39.genome.fa"
gtf="gencode.vM30.annotation.gtf"
STAR --runThreadN $SLURM_JOB_CPUS_PER_NODE \
     --runMode genomeGenerate \
     --genomeDir $(pwd) \
     --genomeFastaFiles $fastaGenome \
     --sjdbGTFfile $gtf \
     --sjdbOverhang 1

Each fastq file was mapped to the indexed genome as using runSTAR_map.sh script shown below:

#!/bin/bash
read1=$1
STARgenomeDir=$(pwd)
# illumina adapter
adapterseq="AGATCGGAAGAGC"
STAR \
    --genomeDir ${STARgenomeDir} \
    --readFilesIn ${read1} \
    --outSAMunmapped Within \
    --readFilesCommand zcat \
    --outSAMtype BAM SortedByCoordinate \
    --quantMode GeneCounts \
    --outFilterMultimapNmax 20 \
    --clip3pAdapterSeq ${adapterseq} \
    --clip3pAdapterMMp 0.1 \
    --outFilterMismatchNoverLmax 0.03 \
    --outFilterScoreMinOverLread 0 \
    --outFilterMatchNminOverLread 0 \
    --outFilterMatchNmin 16 \
    --alignSJDBoverhangMin 1000 \ 
    --alignIntronMax 1 \
    --runThreadN ${SLURM_JOB_CPUS_PER_NODE} \
    --genomeLoad LoadAndKeep \
    --limitBAMsortRAM 30000000000 \
    --outSAMheaderHD "@HD VN:1.4 SO:coordinate"

Mapping was run with a simple loop:

for fq in *.fq.gz; do
  runSTAR_map.sh $fq;
done

Counting Stats

library(tidyverse)
library(scales)
setwd("/work/LAS/geetu-lab/arnstrm/smRNAseq.ShortTitle")
file1="assets/processed_counts_star.tsv"
file2="assets/summary_stats_star.tsv"
counts <-
  read.csv(
    file1,
    sep = "\t",
    stringsAsFactors = TRUE
  )
subread <-
  read.csv(
    file2,
    sep = "\t",
    stringsAsFactors = TRUE
  )
# convert long format
counts.long <- gather(counts, Sample, Count, Dif_D6_1_S4:Undif_D2_4_S5, factor_key=TRUE)
subread.long <- gather(subread, Sample,  Count, Dif_D6_1_S4:Undif_D2_4_S5, factor_key=TRUE)
# organize
counts.long$Group <-
  factor(
    counts.long$Group,
    levels = c(
      "coding_genes",
      "non_conding_RNA",
      "long_non_conding_RNA",
      "pseudogenes",
      "unconfirmed_genes",
      "Ig_genes"
    )
  )

subread.long$Assignments <-
  factor(
    subread.long$Assignments,
    levels = c(
      "N_input",
      "N_unmapped",
      "N_multimapping",
      "N_unique",
      "N_ambiguous",
      "N_noFeature"
    )
  )
ggplot(subread.long, aes(x = Assignments, y = Count, fill = Assignments)) +
  geom_bar(stat = 'identity') +
  labs(x = "Subread assingments", y = "reads") + theme_minimal() +
  scale_y_continuous(labels = label_comma()) +
  theme(
    axis.text.x = element_text(
      angle = 45,
      vjust = 1,
      hjust = 1,
      size = 12
      ),
    strip.text = element_text(
      face = "bold",
      color = "gray35",
      hjust = 0,
      size = 10
    ),
    strip.background = element_rect(fill = "white", linetype = "blank"),
    legend.position = "none"
  ) +
  facet_wrap("Sample", scales = "free_y", ncol = 4) 
STAR read mapping and feature assignment

STAR read mapping and feature assignment

ggplot(counts.long, aes(x = Group, y = Count, fill = GeneType)) +
  geom_bar(stat = 'sum') +
  labs(x = "gene type", y = "read counts") + theme_minimal() +
  scale_y_continuous(labels = label_comma()) +
  theme(
    axis.text.x = element_text(
      angle = 45,
      vjust = 1,
      hjust = 1,
      size = 12
    ),
    strip.text = element_text(
      face = "bold",
      color = "gray35",
      hjust = 0,
      size = 10
    ),
    strip.background = element_rect(fill = "white", linetype = "blank"),
    legend.position = "none"
  ) +
  facet_wrap("Sample", scales = "free_y", ncol = 4)
Features with read counts

Features with read counts

DESeq2

For the next steps, we used DESeq2 for performing the DE analyses. Results were visualized as volcano plots and tables were exported to excel.

Load packages

setwd("/work/LAS/geetu-lab/arnstrm/smRNAseq.mm10")
library(DESeq2)
library(RColorBrewer)
library(plotly)
library(pheatmap)
library(genefilter)
library(ggrepel)

Import counts and sample metadata

The counts data and its associated metadata (coldata) are imported for analyses.

counts = 'assets/noncoding_counts_star.tsv'
groupFile = 'assets/samples.tsv'
coldata <-
  read.csv(
    groupFile,
    row.names = 1,
    sep = "\t",
    stringsAsFactors = TRUE
  )
cts <- as.matrix(read.csv(counts, sep = "\t", row.names = "Geneid"))

Inspect the coldata.

DT::datatable(coldata)

Reorder columns of cts according to coldata rows. Check if samples in both files match.

colnames(cts)
#> [1] "Dif_D6_1_S4"   "Dif_D6_2_S3"   "Dif_D6_3_S2"   "Dif_D6_4_S1"  
#> [5] "Undif_D2_1_S8" "Undif_D2_2_S7" "Undif_D2_3_S6" "Undif_D2_4_S5"
all(rownames(coldata) %in% colnames(cts))
#> [1] TRUE
cts <- cts[, rownames(coldata)]

Normalize

The batch corrected read counts are then used for running DESeq2 analyses

dds <- DESeqDataSetFromMatrix(countData = cts,
                              colData = coldata,
                              design = ~ Group)
vsd <- vst(dds, blind = FALSE, nsub =500)
keep <- rowSums(counts(dds)) >= 5
dds <- dds[keep, ]
dds <- DESeq(dds)
dds
#> class: DESeqDataSet 
#> dim: 1426 8 
#> metadata(1): version
#> assays(4): counts mu H cooks
#> rownames(1426): ENSMUSG00000119106.1 ENSMUSG00000119589.1 ...
#>   ENSMUSG00000065444.3 ENSMUSG00000077869.3
#> rowData names(22): baseMean baseVar ... deviance maxCooks
#> colnames(8): Dif_D6_1_S4 Dif_D6_2_S3 ... Undif_D2_3_S6 Undif_D2_4_S5
#> colData names(2): Group sizeFactor
vst <- assay(vst(dds, blind = FALSE, nsub = 500))
vsd <- vst(dds, blind = FALSE, nsub = 500)
pcaData <-
  plotPCA(vsd,
          intgroup = "Group",
          returnData = TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

PCA plot for QC

PCA plot for the dataset that includes all libraries.

rv <- rowVars(assay(vsd))
select <-
  order(rv, decreasing = TRUE)[seq_len(min(500, length(rv)))]
pca <- prcomp(t(assay(vsd)[select, ]))
percentVar <- pca$sdev ^ 2 / sum(pca$sdev ^ 2)
intgroup = "Group"
intgroup.df <- as.data.frame(colData(vsd)[, intgroup, drop = FALSE])
group <- if (length(intgroup) == 1) {
  factor(apply(intgroup.df, 1, paste, collapse = " : "))
}
d <- data.frame(
  PC1 = pca$x[, 1],
  PC2 = pca$x[, 2],
  intgroup.df,
  name = colnames(vsd)
)

plot PCA for components 1 and 2

g <- ggplot(d, aes(PC1, PC2, color = Group)) +
  scale_shape_manual(values = 1:8) +
  theme_bw() +
  theme(legend.title = element_blank()) +
  geom_point(size = 2, stroke = 2) +
  xlab(paste("PC1", round(percentVar[1] * 100, 2), "% variance")) +
  ylab(paste("PC2", round(percentVar[2] * 100, 2), "% variance"))
ggplotly(g)

PCA plot for the first 2 principal components

Sample distance for QC

sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- colnames(vsd)
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows = sampleDists,
         clustering_distance_cols = sampleDists,
         col = colors)
Euclidean distance between samples

Euclidean distance between samples

Set contrasts and find DE genes

resultsNames(dds)
#> [1] "Intercept"          "Group_Undf_vs_Diff"
res.UndfvsDiff <- results(dds, contrast = c("Group", "Undf", "Diff"))
table(res.UndfvsDiff$padj < 0.05)
#> 
#> FALSE  TRUE 
#>   624   332
res.UndfvsDiff <- res.UndfvsDiff[order(res.UndfvsDiff$padj),]
res.UndfvsDiffdata <-
  merge(
    as.data.frame(res.UndfvsDiff),
    as.data.frame(counts(dds, normalized = TRUE)),
    by = "row.names",
    sort = FALSE
  )
names(res.UndfvsDiffdata)[1] <- "Gene"
write_delim(res.UndfvsDiffdata, file = "DESeq2results-UndfvsDiff_fc.tsv", delim = "\t")

Volcano plots

mart <-
  read.csv(
    "assets/mart_export.txt",
    sep = "\t",
    stringsAsFactors = TRUE,
    header = TRUE
  ) #this object was obtained from Ensembl as we illustrated in "Creating gene lists"
volcanoPlots2 <-
  function(res.se,
           string,
           first,
           second,
           color1,
           color2,
           color3,
           ChartTitle) {
    res.se <- res.se[order(res.se$padj), ]
    res.se <-
      rownames_to_column(as.data.frame(res.se[order(res.se$padj), ]))
    names(res.se)[1] <- "Gene"
    res.data <-
      merge(res.se,
            mart,
            by.x = "Gene",
            by.y = "geneid.version")
    res.data <- res.data %>% mutate_all(na_if, "")
    res.data <- res.data %>% mutate_all(na_if, " ")
    res.data <-
      res.data %>% mutate(gene_symbol = coalesce(gene.symbol, Gene))
    res.data$diffexpressed <- "other.genes"
    res.data$diffexpressed[res.data$log2FoldChange >= 1 &
                             res.data$padj <= 0.05] <-
      paste("Higher expression in", first)
    res.data$diffexpressed[res.data$log2FoldChange <= -1 &
                             res.data$padj <= 0.05] <-
      paste("Higher expression in", second)
    res.data$delabel <- ""
    res.data$delabel[res.data$log2FoldChange >= 1
                     & res.data$padj <= 0.05
                     &
                       !is.na(res.data$padj)] <-
      res.data$gene_symbol[res.data$log2FoldChange >= 1
                           &
                             res.data$padj <= 0.05
                           &
                             !is.na(res.data$padj)]
    res.data$delabel[res.data$log2FoldChange <= -1
                     & res.data$padj <= 0.05
                     &
                       !is.na(res.data$padj)] <-
      res.data$gene_symbol[res.data$log2FoldChange <= -1
                           &
                             res.data$padj <= 0.05
                           &
                             !is.na(res.data$padj)]
    ggplot(res.data,
             aes(
               x = log2FoldChange,
               y = -log10(padj),
               col = diffexpressed,
               label = delabel
             )) +
      geom_point(alpha = 0.5) +
      xlim(-20, 20) +
      theme_classic() +
      scale_color_manual(name = "Expression", values = c(color1, color2, color3)) +
      geom_text_repel(
        data = subset(res.data, padj <= 0.05),
        max.overlaps  = 15,
        show.legend = F,
        min.segment.length = Inf,
        seed = 42,
        box.padding = 0.5
      ) +
      ggtitle(ChartTitle) +
      xlab(paste("log2 fold change")) +
      ylab("-log10 pvalue (adjusted)") +
      theme(legend.text.align = 0)
}
volcanoPlots2(
  res.UndfvsDiff,
  "UndfvsDiff",
  "Undf",
  "Diff",
  "green",
  "blue",
  "grey",
  ChartTitle = "Undifferenciated vs. Differenciated"
)
#> Warning: Removed 470 rows containing missing values (geom_point).
#> Warning: ggrepel: 316 unlabeled data points (too many overlaps). Consider
#> increasing max.overlaps
Undifferenciated vs. Differenciated

Undifferenciated vs. Differenciated

Heatmap

Heatmap for the top 30 variable genes:

topVarGenes <- head(order(rowVars(assay(vsd)), decreasing = TRUE), 30)
mat  <- assay(vsd)[ topVarGenes, ]
mat  <- mat - rowMeans(mat)
mat2 <-    merge(mat,
          mart,
          by.x = 'row.names',
          by.y = "geneid.version")
rownames(mat2) <- mat2[,10]
mat2 <- mat2[2:9]
heat_colors <- brewer.pal(9, "YlOrRd")
g <- pheatmap(
      mat2,
      color = heat_colors,
      main = "Top 30 variable smRNA/lncRNA genes",
      cluster_rows = F,
      cluster_cols  = T,
      show_rownames = T,
      border_color = NA,
      fontsize = 10,
      scale = "row",
      fontsize_row = 10
    )
    g
Heat map for top 30 variable genes

Heat map for top 30 variable genes